rm(list = ls(all = TRUE))
library(reshape2)
library(ggplot2)


# define number of topics
N.topics <- 22
no.words <- 20

### PATHS ##############################
gamma.file <- paste0("./data/",N.topics,"-topics_100_iterations_dtm_output/lda-seq/gam.dat")
label.file <- paste0("./data/matches_DTM_model_",N.topics,"_topics_top_", no.words, "_words.csv")
seq.file <- "./data/data-seq.dat"
plots.path <- "./plots"
########################################

# read sequence file to calculate number of sessions and number of docs per session 
seqfile <- read.table(seq.file)

N.sessions <- nrow(seqfile)-1

docs.by.session <- seqfile$V1[2:nrow(seqfile)]
sessions.unique <- seq(1, N.sessions)

session <- rep(sessions.unique, docs.by.session)


# load estimated gammas
gamma <- scan(gamma.file)

# calculate expected topic mixtures for each document by dividing the gammas associated with each document by the sum for each document
b <- matrix(gamma, ncol=N.topics, byrow=TRUE)
rs <- rowSums(b)
topic.mixtures <- b / rs
colnames(topic.mixtures) <- paste0("topic", seq(1, N.topics))

# combine expected topic mixtures with session information
data.wide <- data.frame(cbind(session, topic.mixtures))

# create unique ID
data.wide$id <- seq(1, nrow(data.wide))

# clean-up to free memory
rm(topic.mixtures)

# convert from wide to long format
id.vars <- names(data.wide)[!(names(data.wide) %in% paste0("topic", seq(1, N.topics)))]
data <- melt(data.wide, id=id.vars)
names(data) <- c(id.vars, "topic", "prop")

# add years
# Note: each session corresponds to one year, but 2010-2012 was one session
data$year <- data$session + 1934
data$year[data$year>=2011] <- data$year[data$year>=2011] + 1

    
# calculate average topic proportions in each year
a <- aggregate(data$prop, by=list(data$year, data$topic), mean)
names(a) <- c("year", "topic", "avg")

# add topic labels from policy agenda matching
labels <- read.csv(label.file, stringsAsFactors=FALSE)

# nicer labels
labels$pa.label.jaccard <- factor(labels$pa.label.jaccard)

levels(labels$pa.label.jaccard)[levels(labels$pa.label.jaccard)=="macroeconomics"] <- "Macroeconomics"
levels(labels$pa.label.jaccard)[levels(labels$pa.label.jaccard)=="civil rights and minority issues"] <- "Civil Rights and Minority Issues"
levels(labels$pa.label.jaccard)[levels(labels$pa.label.jaccard)=="health"] <- "Health"
levels(labels$pa.label.jaccard)[levels(labels$pa.label.jaccard)=="agriculture"] <- "Agriculture"
levels(labels$pa.label.jaccard)[levels(labels$pa.label.jaccard)=="labor and employment"] <- "Labor and Employment"
levels(labels$pa.label.jaccard)[levels(labels$pa.label.jaccard)=="education and culture"] <- "Education and Culture"
levels(labels$pa.label.jaccard)[levels(labels$pa.label.jaccard)=="environment"] <- "Environment"
levels(labels$pa.label.jaccard)[levels(labels$pa.label.jaccard)=="energy"] <- "Energy"
levels(labels$pa.label.jaccard)[levels(labels$pa.label.jaccard)=="transportation"] <- "Transportation"
levels(labels$pa.label.jaccard)[levels(labels$pa.label.jaccard)=="law crime and family issues"] <- "Law, Crime, and Family Issues"
levels(labels$pa.label.jaccard)[levels(labels$pa.label.jaccard)=="social welfare"] <- "Social Welfare"
levels(labels$pa.label.jaccard)[levels(labels$pa.label.jaccard)=="community development planning and housing"] <- "Community Development,\nPlanning and Housing"
levels(labels$pa.label.jaccard)[levels(labels$pa.label.jaccard)=="banking finance and domestic commerce"] <- "Banking, Finance, and\nDomestic Commerce"
levels(labels$pa.label.jaccard)[levels(labels$pa.label.jaccard)=="defense"] <- "Defense"
levels(labels$pa.label.jaccard)[levels(labels$pa.label.jaccard)=="space science technology and communications"] <- "Space, Science, Technology and Communications"
levels(labels$pa.label.jaccard)[levels(labels$pa.label.jaccard)=="foreign trade"] <- "Foreign Trade"
levels(labels$pa.label.jaccard)[levels(labels$pa.label.jaccard)=="international affairs and foreign aid"] <- "International Affairs\nand Foreign Aid"
levels(labels$pa.label.jaccard)[levels(labels$pa.label.jaccard)=="government operations"] <- "Government Operations"
levels(labels$pa.label.jaccard)[levels(labels$pa.label.jaccard)=="public lands water management colonial"] <- "Public Lands, Water\nManagement, Colonial"


## combine labels with jaccard value
labels$topic.label <- paste0(labels$pa.label.jaccard, " (",labels$jaccard.value,")")
labels$topic.label[duplicated(labels$topic.label)] <- paste(labels$topic.label[duplicated(labels$topic.label)],"2")

a$topic.number <- as.numeric(a$topic)-1

a <- merge(a, labels, by.x="topic.number", by.y="dtm.topic.number")
a <- a[order(a$year, a$topic),]

# calculate average proportions
total.props <- aggregate(a$avg, list(a$topic), mean)
names(total.props) <- c("topic", "avg.prop")

# sort topic labels by jaccard value
tmp <- a[!duplicated(a$topic.label), c("topic.label", "jaccard.value")]
tmp <- tmp[order(tmp$jaccard.value, decreasing=TRUE),]

a$topic.label <- factor(a$topic.label, level=tmp$topic.label)


# PLOTS
# -----
# Time series plot of topic distributions
x.labels <- sort(unique(a$year))
x.labels[!(x.labels %in% seq(1935,2013,5))] <- NA
x.labels <- as.character(x.labels)
x.labels[is.na(x.labels)] <- ""

p <- ggplot(a, aes(x=year, y=avg, group=topic.label)) +
    theme_bw() +
    facet_wrap(~ topic.label, ncol=3) +
    geom_line(size=1) +
    scale_x_continuous(
        breaks = sort(unique(a$year)),
        labels = x.labels
        ) +
    theme(axis.text.x = element_text(angle=45, size=7, hjust=1)) +
    xlab("") +
    ylab("Topic Proportion") +
    theme(plot.title = element_text(hjust = 0.5)) + # center
    theme(plot.title = element_text(lineheight=.8, face="bold", size=16, vjust=1.5))

fname <- file.path(plots.path, "topic_proportions_over_time.pdf")
pdf(fname, width=8, height=12)
 print(p)
dev.off()

